library(tidyverse)
library(corrplot)
library("PerformanceAnalytics")
library(gtsummary)
library(flextable)
library(caret)
library(glmnet)
library(e1071)
food_data <- read_csv("Food_Supply_kcal_Data.csv")
head(food_data)
## # A tibble: 6 x 32
## Country `Alcoholic Beve… `Animal Product… `Animal fats` `Aquatic Produc…
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghan… 0 4.78 0.850 0
## 2 Albania 0.912 16.1 1.06 0
## 3 Algeria 0.0896 6.03 0.194 0
## 4 Angola 1.94 4.69 0.264 0
## 5 Antigu… 2.30 15.4 1.54 0
## 6 Argent… 1.44 15.0 1.06 0
## # … with 27 more variables: `Cereals - Excluding Beer` <dbl>, Eggs <dbl>,
## # `Fish, Seafood` <dbl>, `Fruits - Excluding Wine` <dbl>, Meat <dbl>, `Milk -
## # Excluding Butter` <dbl>, Miscellaneous <dbl>, Offals <dbl>, Oilcrops <dbl>,
## # Pulses <dbl>, Spices <dbl>, `Starchy Roots` <dbl>, Stimulants <dbl>, `Sugar
## # Crops` <dbl>, `Sugar & Sweeteners` <dbl>, Treenuts <dbl>, `Vegetal
## # Products` <dbl>, `Vegetable Oils` <dbl>, Vegetables <dbl>, Obesity <dbl>,
## # Undernourished <chr>, Confirmed <dbl>, Deaths <dbl>, Recovered <dbl>,
## # Active <dbl>, Population <dbl>, `Unit (all except Population)` <chr>
sum(food_data[1,2:24])
## [1] 100.0002
food_data <- food_data %>%
drop_na() %>%
mutate(Confirmed_per_capita = Confirmed/Population * 100000000, Deaths_per_capita = Deaths/Population * 100000000) %>%
#creating a column that specifies whether each country falls above or below the median
mutate(Cases_vs_median = as.character(ifelse(Confirmed_per_capita > median(Confirmed_per_capita), "Above", "Below")),
Deaths_vs_median = as.character(ifelse(Deaths_per_capita > median(Deaths_per_capita), "Above", "Below")))
food_data <- food_data %>%
select(-Miscellaneous, -Obesity, -Undernourished, -Recovered, -Active, -Population, -`Unit (all except Population)`,
-Confirmed, -Deaths)
head(food_data)
## # A tibble: 6 x 27
## Country `Alcoholic Beve… `Animal Product… `Animal fats` `Aquatic Produc…
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghan… 0 4.78 0.850 0
## 2 Albania 0.912 16.1 1.06 0
## 3 Algeria 0.0896 6.03 0.194 0
## 4 Angola 1.94 4.69 0.264 0
## 5 Argent… 1.44 15.0 1.06 0
## 6 Armenia 0.227 12.8 1.77 0
## # … with 22 more variables: `Cereals - Excluding Beer` <dbl>, Eggs <dbl>,
## # `Fish, Seafood` <dbl>, `Fruits - Excluding Wine` <dbl>, Meat <dbl>, `Milk -
## # Excluding Butter` <dbl>, Offals <dbl>, Oilcrops <dbl>, Pulses <dbl>,
## # Spices <dbl>, `Starchy Roots` <dbl>, Stimulants <dbl>, `Sugar Crops` <dbl>,
## # `Sugar & Sweeteners` <dbl>, Treenuts <dbl>, `Vegetal Products` <dbl>,
## # `Vegetable Oils` <dbl>, Vegetables <dbl>, Confirmed_per_capita <dbl>,
## # Deaths_per_capita <dbl>, Cases_vs_median <chr>, Deaths_vs_median <chr>
corrplot(is.corr=TRUE, cor(food_data[,2:24], use="pairwise.complete.obs"), method="number")
chart.Correlation(food_data[,2:24], histogram=TRUE)
# summary statistics
food_data_1 <- food_data %>%
select(-Deaths_vs_median, -Country)
tbl_summary(data = food_data_1, by = "Cases_vs_median",
statistic = list(all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n}({p}%)")) %>%
add_n() %>%
add_p(test = all_continuous() ~ "aov") %>%
as_flex_table() %>%
bold(part = "header")
Characteristic | N | Above, N = 771 | Below, N = 771 | p-value2 |
Alcoholic Beverages | 154 | 1.76 (1.15) | 0.86 (0.78) | <0.001 |
Animal Products | 154 | 11.5 (4.1) | 6.6 (4.2) | <0.001 |
Animal fats | 154 | 1.77 (1.50) | 0.76 (0.85) | <0.001 |
Aquatic Products, Other | 154 | 0.12 | ||
0 | 77(100%) | 73(95%) | ||
0.0185 | 0(0%) | 1(1.3%) | ||
0.0209 | 0(0%) | 1(1.3%) | ||
0.0336 | 0(0%) | 1(1.3%) | ||
0.4007 | 0(0%) | 1(1.3%) | ||
Cereals - Excluding Beer | 154 | 18 (5) | 23 (7) | <0.001 |
Eggs | 154 | 0.52 (0.26) | 0.32 (0.32) | <0.001 |
Fish, Seafood | 154 | 0.61 (0.60) | 0.57 (0.50) | 0.7 |
Fruits - Excluding Wine | 154 | 2.21 (1.46) | 1.80 (1.43) | 0.082 |
Meat | 154 | 4.44 (1.68) | 2.93 (2.25) | <0.001 |
Milk - Excluding Butter | 154 | 3.99 (1.84) | 1.89 (1.72) | <0.001 |
Offals | 154 | 0.15 (0.09) | 0.14 (0.13) | 0.6 |
Oilcrops | 154 | 0.66 (0.83) | 1.43 (1.85) | 0.001 |
Pulses | 154 | 0.79 (0.73) | 1.48 (1.52) | <0.001 |
Spices | 154 | 0.18 (0.24) | 0.19 (0.25) | >0.9 |
Starchy Roots | 154 | 2.2 (2.4) | 4.3 (5.0) | 0.001 |
Stimulants | 154 | 0.46 (0.36) | 0.15 (0.16) | <0.001 |
Sugar Crops | 154 | 0.00 (0.01) | 0.04 (0.10) | 0.005 |
Sugar & Sweeteners | 154 | 5.62 (1.63) | 3.92 (2.22) | <0.001 |
Treenuts | 154 | 0.32 (0.29) | 0.22 (0.29) | 0.035 |
Vegetal Products | 154 | 38.5 (4.1) | 43.4 (4.2) | <0.001 |
Vegetable Oils | 154 | 5.09 (2.17) | 4.66 (2.19) | 0.2 |
Vegetables | 154 | 1.20 (0.63) | 0.93 (0.63) | 0.008 |
Confirmed_per_capita | 154 | 161 (282) | 1 (2) | <0.001 |
Deaths_per_capita | 154 | 2.21 (3.74) | 0.03 (0.05) | <0.001 |
1Mean (SD); n(%) | ||||
2One-way ANOVA; Fisher's exact test | ||||
food_data_2 <- food_data %>%
select(-Cases_vs_median, -Country)
tbl_summary(data = food_data_2, by = "Deaths_vs_median",
statistic = list(all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n}({p}%)")) %>%
add_n() %>%
add_p(test = all_continuous() ~ "aov") %>%
as_flex_table() %>%
bold(part = "header")
Characteristic | N | Above, N = 771 | Below, N = 771 | p-value2 |
Alcoholic Beverages | 154 | 1.75 (1.15) | 0.88 (0.80) | <0.001 |
Animal Products | 154 | 11.4 (4.2) | 6.7 (4.2) | <0.001 |
Animal fats | 154 | 1.78 (1.50) | 0.75 (0.83) | <0.001 |
Aquatic Products, Other | 154 | 0.12 | ||
0 | 77(100%) | 73(95%) | ||
0.0185 | 0(0%) | 1(1.3%) | ||
0.0209 | 0(0%) | 1(1.3%) | ||
0.0336 | 0(0%) | 1(1.3%) | ||
0.4007 | 0(0%) | 1(1.3%) | ||
Cereals - Excluding Beer | 154 | 18 (5) | 23 (7) | <0.001 |
Eggs | 154 | 0.53 (0.26) | 0.31 (0.32) | <0.001 |
Fish, Seafood | 154 | 0.60 (0.60) | 0.58 (0.50) | 0.8 |
Fruits - Excluding Wine | 154 | 2.08 (1.37) | 1.94 (1.54) | 0.5 |
Meat | 154 | 4.38 (1.74) | 2.99 (2.24) | <0.001 |
Milk - Excluding Butter | 154 | 3.98 (1.84) | 1.90 (1.73) | <0.001 |
Offals | 154 | 0.14 (0.08) | 0.14 (0.14) | >0.9 |
Oilcrops | 154 | 0.68 (0.94) | 1.42 (1.81) | 0.002 |
Pulses | 154 | 0.78 (0.68) | 1.49 (1.54) | <0.001 |
Spices | 154 | 0.17 (0.23) | 0.20 (0.25) | 0.5 |
Starchy Roots | 154 | 2.0 (2.3) | 4.5 (5.0) | <0.001 |
Stimulants | 154 | 0.46 (0.36) | 0.15 (0.17) | <0.001 |
Sugar Crops | 154 | 0.00 (0.01) | 0.04 (0.10) | 0.004 |
Sugar & Sweeteners | 154 | 5.67 (1.65) | 3.86 (2.16) | <0.001 |
Treenuts | 154 | 0.31 (0.28) | 0.22 (0.30) | 0.045 |
Vegetal Products | 154 | 38.6 (4.2) | 43.3 (4.2) | <0.001 |
Vegetable Oils | 154 | 5.24 (2.20) | 4.52 (2.12) | 0.041 |
Vegetables | 154 | 1.19 (0.64) | 0.94 (0.62) | 0.013 |
Confirmed_per_capita | 154 | 157 (283) | 5 (27) | <0.001 |
Deaths_per_capita | 154 | 2.21 (3.74) | 0.03 (0.04) | <0.001 |
1Mean (SD); n(%) | ||||
2One-way ANOVA; Fisher's exact test | ||||
# prepare data for cross validation
food_data <- food_data %>%
# delete country since only each country has only one observation
select(-`Animal Products`, -Country,-`Vegetal Products`)
food_data %>% head()
## # A tibble: 6 x 24
## `Alcoholic Beve… `Animal fats` `Aquatic Produc… `Cereals - Excl… Eggs
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.850 0 37.1 0.150
## 2 0.912 1.06 0 16.2 0.809
## 3 0.0896 0.194 0 25.0 0.418
## 4 1.94 0.264 0 18.4 0.0441
## 5 1.44 1.06 0 16.8 0.864
## 6 0.227 1.77 0 19.3 0.731
## # … with 19 more variables: `Fish, Seafood` <dbl>, `Fruits - Excluding
## # Wine` <dbl>, Meat <dbl>, `Milk - Excluding Butter` <dbl>, Offals <dbl>,
## # Oilcrops <dbl>, Pulses <dbl>, Spices <dbl>, `Starchy Roots` <dbl>,
## # Stimulants <dbl>, `Sugar Crops` <dbl>, `Sugar & Sweeteners` <dbl>,
## # Treenuts <dbl>, `Vegetable Oils` <dbl>, Vegetables <dbl>,
## # Confirmed_per_capita <dbl>, Deaths_per_capita <dbl>, Cases_vs_median <chr>,
## # Deaths_vs_median <chr>
library(GGally)
# a panel of scatterplots using `GGally`
ggpairs(data = food_data[,c(1:20,24)],
ggplot2::aes(colour = Deaths_vs_median, alpha =0.5))
food_data <- food_data %>%
mutate(Cases_vs_median = as.factor(ifelse(Cases_vs_median == "Above",1,0)),
Deaths_vs_median = as.factor(ifelse(Deaths_vs_median == "Above",1,0)))
# categorical -> predict deaths
library(rBayesianOptimization)
set.seed(123)
cv_folds_10 = createFolds(y=food_data$Deaths_vs_median, k=10)
result_logistic <- list()
result_knn <- list()
result_linear_svm <- list()
result_radial_svm <- list()
result_polynomial_svm <- list()
result_qda <- list()
result_lasso <- list()
result_ridge <- list()
food_data %>% head()
## # A tibble: 6 x 24
## `Alcoholic Beve… `Animal fats` `Aquatic Produc… `Cereals - Excl… Eggs
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.850 0 37.1 0.150
## 2 0.912 1.06 0 16.2 0.809
## 3 0.0896 0.194 0 25.0 0.418
## 4 1.94 0.264 0 18.4 0.0441
## 5 1.44 1.06 0 16.8 0.864
## 6 0.227 1.77 0 19.3 0.731
## # … with 19 more variables: `Fish, Seafood` <dbl>, `Fruits - Excluding
## # Wine` <dbl>, Meat <dbl>, `Milk - Excluding Butter` <dbl>, Offals <dbl>,
## # Oilcrops <dbl>, Pulses <dbl>, Spices <dbl>, `Starchy Roots` <dbl>,
## # Stimulants <dbl>, `Sugar Crops` <dbl>, `Sugar & Sweeteners` <dbl>,
## # Treenuts <dbl>, `Vegetable Oils` <dbl>, Vegetables <dbl>,
## # Confirmed_per_capita <dbl>, Deaths_per_capita <dbl>, Cases_vs_median <fct>,
## # Deaths_vs_median <fct>
food_data$Cases_vs_median <- as_factor(food_data$Cases_vs_median)
food_data$Deaths_vs_median <- as_factor(food_data$Deaths_vs_median)
for(f in 1:length(cv_folds_10)){
food_data_train_cv_10 <- food_data[-cv_folds_10[[f]],]
food_data_test_cv_10 <- food_data[cv_folds_10[[f]],]
# knn
set.seed(2342)
knn_fit <- train(Deaths_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Cases_vs_median, data = food_data_train_cv_10,
method = "knn", tuneLength = 20, preProcess = c("scale", "center"),
trControl = trainControl(method="cv", number = 5))
# logistic
logistic_fit <- glm(Deaths_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Cases_vs_median, data = food_data_train_cv_10, family = binomial())
# SVM - linear
set.seed(2342)
svr_linear_tune <- tune(svm, Deaths_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Cases_vs_median, data = food_data_train_cv_10, kernel ="linear", ranges=list(cost=1:3, epsilon=c(0.1,0.5,1)))
linear_svm_fit <- svr_linear_tune$best.model
# SVM - radial
set.seed(2342)
svr_radial_tune <- tune(svm, Deaths_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Cases_vs_median, data = food_data_train_cv_10, kernel ="radial", ranges=list(gamma=c(0.001, 0.05, 0.1), cost=1:3, epsilon=c(0.1,0.5,1)))
radial_svm_fit <- svr_radial_tune$best.model
# SVM - polynomial
set.seed(2342)
svr_polynomial_tune <- tune(svm, Deaths_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Cases_vs_median, data = food_data_train_cv_10, kernel ="polynomial", ranges=list(gamma=c(0.001, 0.05, 0.1), cost=1:3, d=seq(2,3)))
polynomial_svm_fit <- svr_polynomial_tune$best.model
# QDA
frmla <-as.formula(paste0(names(food_data_train_cv_10)[24], "~", paste0("`", names(food_data_train_cv_10)[-c(3,21:24)], "`", collapse="+")))
qda_fit <- train(frmla, data = food_data_train_cv_10, method = "qda")
# lasso & ridge
lasso_fit = cv.glmnet(x = data.matrix(food_data[,c(1:20)]), y = data.matrix(food_data[,24]), alpha=1, family="binomial")
ridge_fit = cv.glmnet(x = data.matrix(food_data[,c(1:20)]), y = data.matrix(food_data[,24]), alpha=0, family="binomial")
# knn
food_data_test_cv_10$Deaths_knn_predict <- predict(knn_fit, newdata = food_data_test_cv_10)
result_knn[[f]] <- 1-confusionMatrix(data = food_data_test_cv_10$Deaths_knn_predict, reference = as.factor(food_data_test_cv_10$Deaths_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Deaths_knn_predict)
#logistic
food_data_test_cv_10$Deaths_logistic_predict_num <- predict(logistic_fit, newdata = food_data_test_cv_10, type="response", na.action=na.exclude)
food_data_test_cv_10$Deaths_logistic_predict <- as_factor(ifelse(food_data_test_cv_10$Deaths_logistic_predict_num >= 0.5, "1", "0"))
result_logistic[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Deaths_logistic_predict, reference = as.factor(food_data_test_cv_10$Deaths_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Deaths_logistic_predict, -Deaths_logistic_predict_num)
# linear svm
food_data_test_cv_10$Deaths_linear_svm_predict <- predict(linear_svm_fit, newdata = food_data_test_cv_10)
result_linear_svm[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Deaths_linear_svm_predict, reference = as.factor(food_data_test_cv_10$Deaths_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Deaths_linear_svm_predict)
# radial svm
food_data_test_cv_10$Deaths_radial_svm_predict <- predict(radial_svm_fit, newdata = food_data_test_cv_10)
result_radial_svm[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Deaths_radial_svm_predict, reference = as.factor(food_data_test_cv_10$Deaths_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Deaths_radial_svm_predict)
# polynomial svm
food_data_test_cv_10$Deaths_polynomial_svm_predict <- predict(polynomial_svm_fit, newdata = food_data_test_cv_10)
result_polynomial_svm[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Deaths_polynomial_svm_predict, reference = as.factor(food_data_test_cv_10$Deaths_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Deaths_polynomial_svm_predict)
# qda
food_data_test_cv_10$Deaths_qda_predict_num <- predict(qda_fit, newdata = food_data_test_cv_10, type = "prob", na.action=na.exclude)$`1`
food_data_test_cv_10$Deaths_qda_predict = relevel(factor(ifelse(food_data_test_cv_10$Deaths_qda_predict_num > 0.5, "1", "0")),
ref = "0")
result_qda[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Deaths_qda_predict,
reference = as.factor(food_data_test_cv_10$Deaths_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Deaths_qda_predict, -Deaths_qda_predict_num)
# Lasso
# probability of being in class "1"
food_data_test_cv_10$Deaths_lasso_predict_num <- predict(lasso_fit, newx=data.matrix(food_data_test_cv_10[,c(1:20)]), type="response")
food_data_test_cv_10$Deaths_lasso_predict <- as_factor(ifelse(food_data_test_cv_10$Deaths_lasso_predict_num >= 0.5, "1", "0"))
result_lasso[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Deaths_lasso_predict, reference = as.factor(food_data_test_cv_10$Deaths_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Deaths_lasso_predict, -Deaths_lasso_predict_num)
# Ridge
food_data_test_cv_10$Deaths_ridge_predict_num <- predict(ridge_fit, newx=data.matrix(food_data_test_cv_10[,c(1:20)]), type="response")
food_data_test_cv_10$Deaths_ridge_predict <- as_factor(ifelse(food_data_test_cv_10$Deaths_ridge_predict_num >= 0.5, "1", "0"))
result_ridge[[f]] <- 1- confusionMatrix(data = food_data_test_cv_10$Deaths_ridge_predict,
reference = as.factor(food_data_test_cv_10$Deaths_vs_median))$overall[1]
food_data_test_cv_10 <- food_data_test_cv_10 %>%
select(-Deaths_ridge_predict, -Deaths_ridge_predict_num)
}
data.frame_knn = data.frame("CV_error" = mean(unlist(result_knn)), "CV_error_SE" = sd(unlist(result_knn)))
data.frame_logistic = data.frame("CV_error" = mean(unlist(result_logistic)), "CV_error_SE" = sd(unlist(result_logistic)))
data.frame_linear_svm = data.frame("CV_error" = mean(unlist(result_linear_svm)), "CV_error_SE" = sd(unlist(result_linear_svm)))
data.frame_radial_svm = data.frame("CV_error" = mean(unlist(result_radial_svm)), "CV_error_SE" = sd(unlist(result_radial_svm)))
data.frame_polynomial_svm = data.frame("CV_error" = mean(unlist(result_polynomial_svm)), "CV_error_SE" = sd(unlist(result_polynomial_svm)))
data.frame_qda = data.frame("CV_error" = mean(unlist(result_qda)), "CV_error_SE" = sd(unlist(result_qda)))
data.frame_lasso = data.frame("CV_error" = mean(unlist(result_lasso)), "CV_error_SE" = sd(unlist(result_lasso)))
data.frame_ridge = data.frame("CV_error" = mean(unlist(result_ridge)), "CV_error_SE" = sd(unlist(result_ridge)))
summary_table <-
rbind("KNN" = data.frame_knn, "Logistic Regression" = data.frame_logistic, "Linear SVM" = data.frame_linear_svm,
"Radial SVM" = data.frame_radial_svm, "Polynomial SVM" = data.frame_polynomial_svm, "QDA" = data.frame_qda,
"Lasso" = data.frame_lasso, "Ridge" = data.frame_ridge)
flextable(summary_table %>% rownames_to_column("Algorithm Name"))
Algorithm Name | CV_error | CV_error_SE |
KNN | 0.25 | 0.093 |
Logistic Regression | 0.23 | 0.109 |
Linear SVM | 0.23 | 0.101 |
Radial SVM | 0.24 | 0.115 |
Polynomial SVM | 0.29 | 0.116 |
QDA | 0.23 | 0.095 |
Lasso | 0.20 | 0.086 |
Ridge | 0.21 | 0.090 |
library(randomForest)
# random forest
names(food_data) <- make.names(names(food_data))
food_data %>% head()
## # A tibble: 6 x 24
## Alcoholic.Bever… Animal.fats Aquatic.Product… Cereals...Exclu… Eggs
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.850 0 37.1 0.150
## 2 0.912 1.06 0 16.2 0.809
## 3 0.0896 0.194 0 25.0 0.418
## 4 1.94 0.264 0 18.4 0.0441
## 5 1.44 1.06 0 16.8 0.864
## 6 0.227 1.77 0 19.3 0.731
## # … with 19 more variables: Fish..Seafood <dbl>, Fruits...Excluding.Wine <dbl>,
## # Meat <dbl>, Milk...Excluding.Butter <dbl>, Offals <dbl>, Oilcrops <dbl>,
## # Pulses <dbl>, Spices <dbl>, Starchy.Roots <dbl>, Stimulants <dbl>,
## # Sugar.Crops <dbl>, Sugar...Sweeteners <dbl>, Treenuts <dbl>,
## # Vegetable.Oils <dbl>, Vegetables <dbl>, Confirmed_per_capita <dbl>,
## # Deaths_per_capita <dbl>, Cases_vs_median <fct>, Deaths_vs_median <fct>
set.seed(123)
cv_folds_10 = createFolds(y=food_data$Deaths_vs_median, k=10)
result <- list()
result_rf <- list()
n_list <- c(50, 250, 500)
p <- ncol(food_data) - 4
c(p/2, sqrt(p), p)
## [1] 10.000000 4.472136 20.000000
m_list <- c(10, 4, 20)
reg_rf_tune <- list()
reg_rf_oob_errors <- list()
round = 0
for(f in 1:length(cv_folds_10)){
food_data_train_cv_10 <- food_data[-cv_folds_10[[f]],]
food_data_test_cv_10 <- food_data[cv_folds_10[[f]],]
for (i in 1:length(n_list)) {
for (j in 1:length(m_list)) {
round <- round + 1
set.seed(7812)
reg_rf_tune[[round]] <- randomForest(Deaths_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Cases_vs_median,
data = food_data_train_cv_10, ntree=n_list[i], mtry=m_list[j])
head(food_data_train_cv_10)
reg_rf_oob_errors[[round]] <- data.frame("trees_no"=n_list[i], "preds_no"=m_list[j],
"oob_errors"=reg_rf_tune[[round]]$err.rate[n_list[i],1])
}
}
reg_rf_oob_errors_df <- do.call("rbind", reg_rf_oob_errors)
best_errors <- which(reg_rf_oob_errors_df$oob_errors==min(reg_rf_oob_errors_df$oob_errors))
set.seed(9984)
reg_rf <- randomForest(Deaths_vs_median~.-Confirmed_per_capita -Deaths_per_capita -Cases_vs_median, data = food_data_train_cv_10,
ntree=reg_rf_oob_errors_df$trees_no[best_errors[1]], mtry=reg_rf_oob_errors_df$preds_no[best_errors[1]])
Deaths_Predict_Prob <- predict(reg_rf, newdata = food_data_test_cv_10, type = "prob")
food_data_test_cv_10$Deaths_rf_Predict <- colnames(Deaths_Predict_Prob)[max.col(Deaths_Predict_Prob, ties.method="first")]
per_class_accuracy <- rep(NA,length(levels(food_data_test_cv_10$Deaths_vs_median)))
for (i in 1:length(per_class_accuracy)){
per_class_accuracy[i] = food_data_test_cv_10 %>%
filter(Deaths_vs_median == levels(Deaths_vs_median)[i]) %>%
summarise(accuracy = sum(Deaths_rf_Predict ==
levels(Deaths_vs_median)[i]) / n()) %>%
unlist()
names(per_class_accuracy)[i] <- paste0("accuracy_",
levels(food_data_test_cv_10$Deaths_vs_median)[i])
}
result[[f]] <- per_class_accuracy
result_rf[[f]] <- 1- confusionMatrix(data = as.factor(food_data_test_cv_10$Deaths_rf_Predict),
reference = as.factor(food_data_test_cv_10$Deaths_vs_median))$overall[1]
}
df = as.data.frame(matrix(unlist(result), ncol = 2, byrow = TRUE))
CV_error = c("Below" = (1-mean(df[,1], na.rm=TRUE)),
"Above" = (1-mean(df[,2], na.rm=TRUE)))
CV_error_SE = c("Below" = sd((1-df[,1]), na.rm=TRUE),
"Above" = sd((1-df[,2]), na.rm=TRUE))
data.frame(CV_error, CV_error_SE)
## CV_error CV_error_SE
## Below 0.2000000 0.2000567
## Above 0.1678571 0.0883683
rf <- data.frame("CV Error" = CV_error, "CV Error SE" = CV_error_SE)
flextable(rf %>% rownames_to_column("Class Name"))
Class Name | CV.Error | CV.Error.SE |
Below | 0.20 | 0.200 |
Above | 0.17 | 0.088 |
data.frame_rf <- data.frame("CV_error" = mean(unlist(result_rf)), "CV_error_SE" = sd(unlist(result_rf)))
summary_table <- rbind(summary_table, "Random Forest" = data.frame_rf)
flextable(summary_table %>% rownames_to_column("Algorithm Name"))
Algorithm Name | CV_error | CV_error_SE |
KNN | 0.25 | 0.093 |
Logistic Regression | 0.23 | 0.109 |
Linear SVM | 0.23 | 0.101 |
Radial SVM | 0.24 | 0.115 |
Polynomial SVM | 0.29 | 0.116 |
QDA | 0.23 | 0.095 |
Lasso | 0.20 | 0.086 |
Ridge | 0.21 | 0.090 |
Random Forest | 0.18 | 0.132 |